*Progetto Data Mining*
Questo dataset contiene circa 10 anni di osservazioni meteo giornaliere da tutta l'Australia. Contiene in particolare dati sulle giornate di pioggia e di sole di tutto il paese. Contiene 23 colonne.
L'obbiettivo del progetto consiste nel fare predizione sulla colonna 'RainTomorrow'
from pandas import DataFrame, Series
from io import StringIO
import pandas as pd
import numpy as np
import os
import matplotlib
import matplotlib.pyplot as plt
import plotly.express as px
from IPython.display import Image
import seaborn as sb
import statistics as stat
import time
import math
from sklearn.metrics import *
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler
from sklearn.metrics import *
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.preprocessing import OrdinalEncoder
from imblearn.over_sampling import SMOTE
def describe(a):
    """Print a short structural summary of *a* and return None.

    Handles numpy arrays, pandas Series and DataFrames specially; anything
    else falls back to a generic value/type dump.
    """
    # isinstance() is the idiomatic type check (exact `type(x) is T` also
    # misses subclasses); branch order keeps the original dispatch.
    if isinstance(a, np.ndarray):
        print("data:\n{}\nshape:{}\ndtype:{}\ntype: {}".format(a, a.shape, a.dtype, type(a)))
    elif isinstance(a, pd.Series):
        print("data:\n{}\nshape:{}\ndtype:{}\nname:{}\nindex-name:{}\ntype:{}".format(a, a.shape, a.dtype, a.name, a.index.name, type(a)))
    elif isinstance(a, pd.DataFrame):
        print("data:\n{}\nshape:{}\ntype:{}".format(a, a.shape, type(a)))
    else:
        print("{}, type:{}".format(a, type(a)))
# Load ~10 years of daily Australian weather observations (one row per station per day).
dataFrameWeather = pd.read_csv('./content/drive/MyDrive/weatherAUS.csv')
dataFrameWeather.shape
(145460, 23)
# Summary statistics of the numeric columns.
dataFrameWeather.describe()
| MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustSpeed | WindSpeed9am | WindSpeed3pm | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 143975.000000 | 144199.000000 | 142199.000000 | 82670.000000 | 75625.000000 | 135197.000000 | 143693.000000 | 142398.000000 | 142806.000000 | 140953.000000 | 130395.00000 | 130432.000000 | 89572.000000 | 86102.000000 | 143693.000000 | 141851.00000 |
| mean | 12.194034 | 23.221348 | 2.360918 | 5.468232 | 7.611178 | 40.035230 | 14.043426 | 18.662657 | 68.880831 | 51.539116 | 1017.64994 | 1015.255889 | 4.447461 | 4.509930 | 16.990631 | 21.68339 |
| std | 6.398495 | 7.119049 | 8.478060 | 4.193704 | 3.785483 | 13.607062 | 8.915375 | 8.809800 | 19.029164 | 20.795902 | 7.10653 | 7.037414 | 2.887159 | 2.720357 | 6.488753 | 6.93665 |
| min | -8.500000 | -4.800000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 980.50000 | 977.100000 | 0.000000 | 0.000000 | -7.200000 | -5.40000 |
| 25% | 7.600000 | 17.900000 | 0.000000 | 2.600000 | 4.800000 | 31.000000 | 7.000000 | 13.000000 | 57.000000 | 37.000000 | 1012.90000 | 1010.400000 | 1.000000 | 2.000000 | 12.300000 | 16.60000 |
| 50% | 12.000000 | 22.600000 | 0.000000 | 4.800000 | 8.400000 | 39.000000 | 13.000000 | 19.000000 | 70.000000 | 52.000000 | 1017.60000 | 1015.200000 | 5.000000 | 5.000000 | 16.700000 | 21.10000 |
| 75% | 16.900000 | 28.200000 | 0.800000 | 7.400000 | 10.600000 | 48.000000 | 19.000000 | 24.000000 | 83.000000 | 66.000000 | 1022.40000 | 1020.000000 | 7.000000 | 7.000000 | 21.600000 | 26.40000 |
| max | 33.900000 | 48.100000 | 371.000000 | 145.000000 | 14.500000 | 135.000000 | 130.000000 | 87.000000 | 100.000000 | 100.000000 | 1041.00000 | 1039.600000 | 9.000000 | 9.000000 | 40.200000 | 46.70000 |
# Check for fully duplicated rows (output shows none).
dataFrameWeather.duplicated().value_counts()
False 145460 Name: count, dtype: int64
# Shuffle the whole frame and inspect 30 random rows.
dataFrameWeather.sample(frac=1).head(30)
| Date | Location | MinTemp | MaxTemp | Rainfall | Evaporation | Sunshine | WindGustDir | WindGustSpeed | WindDir9am | ... | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | RainToday | RainTomorrow | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 143707 | 2017-01-02 | Katherine | 23.2 | 31.0 | 69.0 | NaN | NaN | NNW | 28.0 | NNE | ... | 96.0 | NaN | 1000.2 | 998.0 | 8.0 | NaN | 25.5 | NaN | Yes | Yes |
| 89605 | 2015-09-30 | Cairns | 17.8 | 29.1 | 0.0 | NaN | NaN | ESE | 37.0 | S | ... | 58.0 | 43.0 | 1021.9 | 1018.5 | 3.0 | NaN | 23.9 | 27.8 | No | No |
| 78022 | 2009-02-03 | Watsonia | 19.6 | 30.4 | 1.0 | 4.2 | 8.4 | S | 37.0 | SSE | ... | 71.0 | 43.0 | 1012.7 | 1009.9 | 7.0 | 3.0 | 22.6 | 28.6 | No | No |
| 108269 | 2016-08-28 | Woomera | 12.5 | 23.9 | 0.0 | 9.6 | NaN | NNE | 67.0 | N | ... | 28.0 | 17.0 | 1017.6 | 1011.1 | 6.0 | 6.0 | 15.0 | 23.3 | No | No |
| 32616 | 2015-01-04 | Sydney | 22.4 | 30.4 | 0.0 | 10.2 | 8.9 | SSW | 52.0 | E | ... | 69.0 | 66.0 | 1015.9 | 1012.8 | 1.0 | 7.0 | 26.9 | 27.1 | No | No |
| 43335 | 2011-01-28 | Wollongong | 21.0 | 23.6 | 0.0 | NaN | NaN | S | 43.0 | SSE | ... | 76.0 | 64.0 | 1019.9 | 1019.8 | 5.0 | NaN | 21.6 | 23.1 | No | No |
| 107971 | 2015-11-04 | Woomera | 17.4 | 26.6 | 3.2 | 6.0 | 5.8 | WNW | 61.0 | N | ... | 90.0 | 52.0 | 1006.5 | 1004.8 | 7.0 | 6.0 | 19.2 | 24.3 | Yes | No |
| 81160 | 2009-06-12 | Dartmoor | 2.9 | 11.7 | 0.0 | 1.6 | 2.4 | NNE | 50.0 | NNE | ... | 82.0 | 72.0 | 1018.6 | 1014.4 | NaN | NaN | 6.2 | 11.4 | No | Yes |
| 63253 | 2014-12-01 | Sale | 16.2 | 28.7 | 0.0 | NaN | NaN | S | 35.0 | NNE | ... | 61.0 | 78.0 | 1004.2 | 1005.4 | 6.0 | 7.0 | 24.2 | 20.7 | No | Yes |
| 129518 | 2016-08-10 | Walpole | 9.1 | 16.4 | 5.6 | NaN | NaN | NW | 44.0 | NNW | ... | 99.0 | 64.0 | 1020.7 | 1018.5 | NaN | NaN | 10.7 | 15.4 | Yes | No |
| 107042 | 2013-04-19 | Woomera | 17.9 | 26.2 | NaN | NaN | NaN | E | 41.0 | E | ... | 41.0 | 25.0 | 1021.3 | 1018.7 | 7.0 | NaN | 19.8 | 24.9 | NaN | NaN |
| 79989 | 2014-09-21 | Watsonia | 7.8 | 17.5 | 0.0 | 2.8 | 9.6 | WSW | 30.0 | E | ... | 61.0 | 54.0 | 1031.6 | 1029.2 | 7.0 | 1.0 | 12.8 | 17.2 | No | No |
| 136328 | 2009-08-15 | AliceSprings | 21.3 | 32.9 | 0.0 | 9.2 | 10.9 | N | 56.0 | NNW | ... | 22.0 | 14.0 | 1016.1 | 1011.6 | 3.0 | 3.0 | 25.7 | 31.7 | No | No |
| 99407 | 2017-03-12 | Adelaide | 16.9 | 21.7 | 1.4 | NaN | NaN | WSW | 43.0 | NW | ... | 78.0 | 69.0 | 1004.8 | 1005.5 | NaN | NaN | 18.7 | 20.0 | Yes | Yes |
| 118941 | 2012-09-04 | PerthAirport | 8.1 | 15.8 | 32.2 | 3.8 | 4.1 | WSW | 81.0 | WSW | ... | 64.0 | 62.0 | 1006.8 | 1009.8 | 7.0 | 7.0 | 12.3 | 15.0 | Yes | Yes |
| 50339 | 2012-08-09 | Tuggeranong | 1.8 | 12.9 | 0.0 | NaN | NaN | W | 59.0 | W | ... | 56.0 | 58.0 | 1015.1 | 1012.3 | NaN | NaN | 9.8 | 8.6 | No | No |
| 81028 | 2009-01-31 | Dartmoor | NaN | NaN | NaN | 8.6 | 9.7 | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
| 88747 | 2013-05-25 | Cairns | 18.0 | 25.3 | 0.0 | 4.8 | 11.0 | SSE | 54.0 | S | ... | 47.0 | 48.0 | 1017.8 | 1014.6 | 1.0 | 1.0 | 21.1 | 23.9 | No | No |
| 64565 | 2010-01-10 | MelbourneAirport | 14.7 | 32.0 | 0.0 | 11.4 | 13.2 | S | 30.0 | S | ... | 69.0 | 31.0 | 1017.8 | 1014.9 | 1.0 | 1.0 | 19.5 | 30.6 | No | No |
| 112626 | 2011-11-12 | Witchcliffe | 7.4 | 21.9 | 0.0 | NaN | NaN | W | 44.0 | W | ... | 66.0 | 69.0 | 1019.7 | 1017.7 | NaN | NaN | 18.5 | 18.3 | No | No |
| 28485 | 2012-09-10 | Richmond | 1.9 | 26.7 | 0.0 | 13.6 | NaN | NE | 35.0 | NNE | ... | 70.0 | 19.0 | 1025.8 | 1018.8 | NaN | NaN | 12.7 | 26.3 | No | No |
| 115331 | 2010-12-13 | PearceRAAF | 12.9 | 29.1 | 0.0 | NaN | 13.3 | SW | 48.0 | SW | ... | 50.0 | 36.0 | 1006.9 | 1007.4 | 0.0 | 0.0 | 23.9 | 27.2 | No | No |
| 99552 | 2009-01-09 | MountGambier | 6.4 | 21.6 | 0.0 | 5.2 | 9.9 | S | 43.0 | ESE | ... | 52.0 | 41.0 | 1021.7 | 1017.7 | 7.0 | 6.0 | 15.3 | 20.8 | No | No |
| 71562 | 2012-04-14 | Mildura | 9.2 | 28.0 | 0.0 | 3.8 | 9.4 | NW | 30.0 | NE | ... | 59.0 | 21.0 | 1024.6 | 1020.2 | 6.0 | 2.0 | 16.1 | 27.5 | No | No |
| 55880 | 2011-01-18 | Ballarat | 10.4 | 17.4 | 0.2 | NaN | NaN | SSW | 35.0 | SW | ... | 88.0 | 65.0 | 1010.8 | 1009.5 | 8.0 | 8.0 | 12.0 | 16.1 | No | No |
| 2420 | 2015-10-15 | Albury | 10.5 | 32.4 | 0.0 | NaN | NaN | NNW | 39.0 | SE | ... | 72.0 | 32.0 | 1023.6 | 1018.8 | NaN | NaN | 19.1 | 31.1 | No | No |
| 11257 | 2015-04-08 | CoffsHarbour | 9.6 | 23.3 | 0.0 | NaN | NaN | N | 37.0 | NW | ... | 50.0 | 49.0 | 1015.5 | 1013.2 | NaN | NaN | 18.9 | 23.0 | No | No |
| 127801 | 2011-09-30 | Walpole | 8.6 | 18.4 | 0.2 | NaN | NaN | ESE | 33.0 | ENE | ... | 77.0 | 67.0 | 1020.5 | 1017.4 | NaN | NaN | 14.0 | 16.7 | No | No |
| 123485 | 2016-07-15 | Perth | 10.8 | 22.0 | 0.0 | 3.2 | 1.8 | NNW | 31.0 | NNE | ... | 50.0 | 41.0 | 1017.3 | 1015.1 | 7.0 | 7.0 | 15.6 | 21.0 | No | No |
| 100593 | 2011-12-16 | MountGambier | 8.7 | 28.0 | 0.0 | 4.2 | 12.8 | S | 41.0 | SE | ... | 62.0 | 42.0 | 1018.7 | 1015.2 | 0.0 | 1.0 | 18.8 | 27.1 | No | No |
30 rows × 23 columns
# Count missing values per column to plan the imputation strategy.
dataFrameWeather.isna().sum()
Date 0 Location 0 MinTemp 1485 MaxTemp 1261 Rainfall 3261 Evaporation 62790 Sunshine 69835 WindGustDir 10326 WindGustSpeed 10263 WindDir9am 10566 WindDir3pm 4228 WindSpeed9am 1767 WindSpeed3pm 3062 Humidity9am 2654 Humidity3pm 4507 Pressure9am 15065 Pressure3pm 15028 Cloud9am 55888 Cloud3pm 59358 Temp9am 1767 Temp3pm 3609 RainToday 3261 RainTomorrow 3267 dtype: int64
Come possiamo notare il dataframe non presenta dati duplicati ma presenta molti dati NaN che andranno gestiti. Si inizia trattando i valori di temperatura minima e massima. Questi possono semplicemente essere riempiti con la temperatura media della rispettiva colonna in quanto non presentano un elevato numero di tuple nulle.
import warnings
warnings.filterwarnings('ignore')  # silence plotting/deprecation noise in the notebook
# Histogram of every numeric column to eyeball the distributions.
fig, ax = plt.subplots(figsize=(10, 10))
dataFrameWeather.hist(ax=ax)
plt.subplots_adjust(right=1.2, top=1.2)
plt.show()
Inizio con il riempimento dei dati nan. Parto con 'MinTemp' e 'MaxTemp'. Essendo pochi i dati mancanti, essi verranno sostituiti con la temperatura media di ciascuna colonna.
# MinTemp/MaxTemp have few NaNs: fill them with the column mean.
mean_MinTemp = dataFrameWeather['MinTemp'].mean()
mean_MaxTemp = dataFrameWeather['MaxTemp'].mean()
# Plain assignment instead of `inplace=True` on a column selection: the
# chained-inplace form is deprecated in pandas 2.x and can silently
# operate on a copy instead of the frame.
dataFrameWeather['MinTemp'] = dataFrameWeather['MinTemp'].fillna(mean_MinTemp)
dataFrameWeather['MaxTemp'] = dataFrameWeather['MaxTemp'].fillna(mean_MaxTemp)
dataFrameWeather.isna().sum()
Date 0 Location 0 MinTemp 0 MaxTemp 0 Rainfall 3261 Evaporation 62790 Sunshine 69835 WindGustDir 10326 WindGustSpeed 10263 WindDir9am 10566 WindDir3pm 4228 WindSpeed9am 1767 WindSpeed3pm 3062 Humidity9am 2654 Humidity3pm 4507 Pressure9am 15065 Pressure3pm 15028 Cloud9am 55888 Cloud3pm 59358 Temp9am 1767 Temp3pm 3609 RainToday 3261 RainTomorrow 3267 dtype: int64
Andiamo ora a gestire il dato Rainfall. Esso indica la quantità di acqua caduta in un giorno; 'RainToday' = 'Yes' se e solo se la quantità di acqua caduta è strettamente maggiore di 1mm.
Si decide di droppare le tuple che contengono valori NaN in quanto la correzione richiederebbe troppo tempo per essere implementata visto il numero esiguo di tuple
Si decide di droppare le colonne 'WindDir3pm' e 'WindDir9am' perchè risultano essere irrilevanti ed essere una specifica di una colonna già esistente. Stessa cosa con 'WindSpeed9am' e 'WindSpeed3pm'.
# The per-hour wind direction/speed columns duplicate information already
# carried by the gust columns, so they are removed.
redundant_wind_cols = ['WindDir3pm', 'WindDir9am', 'WindSpeed3pm', 'WindSpeed9am']
dataFrameWeather = dataFrameWeather.drop(columns=redundant_wind_cols)
dataFrameWeather.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 145460 entries, 0 to 145459 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 145460 non-null object 1 Location 145460 non-null object 2 MinTemp 145460 non-null float64 3 MaxTemp 145460 non-null float64 4 Rainfall 142199 non-null float64 5 Evaporation 82670 non-null float64 6 Sunshine 75625 non-null float64 7 WindGustDir 135134 non-null object 8 WindGustSpeed 135197 non-null float64 9 Humidity9am 142806 non-null float64 10 Humidity3pm 140953 non-null float64 11 Pressure9am 130395 non-null float64 12 Pressure3pm 130432 non-null float64 13 Cloud9am 89572 non-null float64 14 Cloud3pm 86102 non-null float64 15 Temp9am 143693 non-null float64 16 Temp3pm 141851 non-null float64 17 RainToday 142199 non-null object 18 RainTomorrow 142193 non-null object dtypes: float64(14), object(5) memory usage: 21.1+ MB
Posso, inoltre, droppare le tuple mancanti nella colonna 'RainTomorrow' essendo di un numero molto esiguo ed essendo questo il parametro che vogliamo andare a predire
# Drop the (relatively few) rows missing Rainfall.
dataFrameWeather.dropna(subset='Rainfall', inplace=True)
dataFrameWeather.isna().sum()
Date 0 Location 0 MinTemp 0 MaxTemp 0 Rainfall 0 Evaporation 60488 Sunshine 67820 WindGustDir 9725 WindGustSpeed 9665 Humidity9am 1554 Humidity3pm 3630 Pressure9am 13940 Pressure3pm 13993 Cloud9am 53331 Cloud3pm 56874 Temp9am 685 Temp3pm 2746 RainToday 0 RainTomorrow 1412 dtype: int64
# Drop rows missing the prediction target, then look at the class balance.
dataFrameWeather.dropna(subset='RainTomorrow', inplace=True)
plt.figure(figsize=(5,6))
ax = sb.countplot(x='RainTomorrow', data=dataFrameWeather, palette="Set1")
plt.show()
dataFrameWeather.info()
<class 'pandas.core.frame.DataFrame'> Index: 140787 entries, 0 to 145458 Data columns (total 19 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 140787 non-null object 1 Location 140787 non-null object 2 MinTemp 140787 non-null float64 3 MaxTemp 140787 non-null float64 4 Rainfall 140787 non-null float64 5 Evaporation 81093 non-null float64 6 Sunshine 73982 non-null float64 7 WindGustDir 131624 non-null object 8 WindGustSpeed 131682 non-null float64 9 Humidity9am 139270 non-null float64 10 Humidity3pm 137286 non-null float64 11 Pressure9am 127044 non-null float64 12 Pressure3pm 127018 non-null float64 13 Cloud9am 88162 non-null float64 14 Cloud3pm 84693 non-null float64 15 Temp9am 140131 non-null float64 16 Temp3pm 138163 non-null float64 17 RainToday 140787 non-null object 18 RainTomorrow 140787 non-null object dtypes: float64(14), object(5) memory usage: 21.5+ MB
Si continua con la sostituizione ai valori nan delle colonne 'Humidity9am', 'Humidity3pm', 'Temp9am', 'Temp3pm'. Qui si andrà ad effettuare una group-by sull'attributo 'RainToday', così da poter differenziare la media calcolata per giorni in cui piove e per giorni in cui non piove.
# Fill humidity/temperature NaNs with per-group means, grouping on
# RainToday so rainy and dry days get different fill values.
average_humidity = dataFrameWeather.groupby('RainToday')['Humidity9am'].transform('mean')
average_humidity2 = dataFrameWeather.groupby('RainToday')['Humidity3pm'].transform('mean')
average_temp = dataFrameWeather.groupby('RainToday')['Temp9am'].transform('mean')
average_temp2 = dataFrameWeather.groupby('RainToday')['Temp3pm'].transform('mean')
# Plain assignment instead of `inplace=True` on a column selection
# (deprecated chained assignment in pandas 2.x, may act on a copy).
dataFrameWeather['Humidity9am'] = dataFrameWeather['Humidity9am'].fillna(average_humidity)
dataFrameWeather['Humidity3pm'] = dataFrameWeather['Humidity3pm'].fillna(average_humidity2)
dataFrameWeather['Temp9am'] = dataFrameWeather['Temp9am'].fillna(average_temp)
dataFrameWeather['Temp3pm'] = dataFrameWeather['Temp3pm'].fillna(average_temp2)
dataFrameWeather.isna().sum()
Date 0 Location 0 MinTemp 0 MaxTemp 0 Rainfall 0 Evaporation 59694 Sunshine 66805 WindGustDir 9163 WindGustSpeed 9105 Humidity9am 0 Humidity3pm 0 Pressure9am 13743 Pressure3pm 13769 Cloud9am 52625 Cloud3pm 56094 Temp9am 0 Temp3pm 0 RainToday 0 RainTomorrow 0 dtype: int64
# 2x2 grid of histograms for four key numeric features.
plt.figure(figsize=(15,10))
# (column to plot, x-axis label) for each panel
panels = [
    ('Rainfall', 'Rainfall'),
    ('WindGustSpeed', 'WindGustSpeed'),  # fixed label typo: was 'WinGustSpeed'
    ('Temp3pm', 'Temp3pm'),
    ('Humidity3pm', 'Humidity'),
]
for pos, (column, xlabel) in enumerate(panels, start=1):
    plt.subplot(2, 2, pos)
    fig = dataFrameWeather[column].hist(bins=10)
    fig.set_xlabel(xlabel)
    # NOTE(review): the y-axis of a histogram is a frequency count, not
    # 'RainTomorrow'; label kept from the original — consider renaming.
    fig.set_ylabel('RainTomorrow')
Text(0, 0.5, 'RainTomorrow')
Si decide di droppare la colonna Sunshine in quanto è tra quelle con la più bassa correlazione con gli altri attributi
# Drop Sunshine (≈48% missing). NOTE(review): the surrounding text mentions
# Evaporation as well, but it is intentionally (?) kept here — confirm.
dataFrameWeather = dataFrameWeather.drop(columns=['Sunshine'], axis=1)
dataFrameWeather.info()
<class 'pandas.core.frame.DataFrame'> Index: 140787 entries, 0 to 145458 Data columns (total 18 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 140787 non-null object 1 Location 140787 non-null object 2 MinTemp 140787 non-null float64 3 MaxTemp 140787 non-null float64 4 Rainfall 140787 non-null float64 5 Evaporation 81093 non-null float64 6 WindGustDir 131624 non-null object 7 WindGustSpeed 131682 non-null float64 8 Humidity9am 140787 non-null float64 9 Humidity3pm 140787 non-null float64 10 Pressure9am 127044 non-null float64 11 Pressure3pm 127018 non-null float64 12 Cloud9am 88162 non-null float64 13 Cloud3pm 84693 non-null float64 14 Temp9am 140787 non-null float64 15 Temp3pm 140787 non-null float64 16 RainToday 140787 non-null object 17 RainTomorrow 140787 non-null object dtypes: float64(13), object(5) memory usage: 20.4+ MB
# Remaining missing values after the column drop.
dataFrameWeather.isna().sum()
Date 0 Location 0 MinTemp 0 MaxTemp 0 Rainfall 0 Evaporation 59694 WindGustDir 9163 WindGustSpeed 9105 Humidity9am 0 Humidity3pm 0 Pressure9am 13743 Pressure3pm 13769 Cloud9am 52625 Cloud3pm 56094 Temp9am 0 Temp3pm 0 RainToday 0 RainTomorrow 0 dtype: int64
Trasformo, a questo punto, la data dal formato aaaa-mm-gg in tre colonne diverse contenenti il giorno, il mese e l'anno. Questa soluzione sarà utile in seguito per fare imputation
df_copy = dataFrameWeather.copy()
# Date is ISO formatted (yyyy-mm-dd, e.g. "2017-01-02"), so after splitting
# on "-" index 0 is the YEAR, index 1 the month and index 2 the day.
# BUG FIX: the original assigned part 0 to 'day' and part 2 to 'year',
# swapping the two columns. Split once instead of three times.
date_parts = df_copy.Date.str.split("-", expand=True)
df_copy['day'] = date_parts[2]
df_copy['month'] = date_parts[1]
df_copy['year'] = date_parts[0]
# The original Date column is no longer needed.
df_copy = df_copy.drop(columns='Date', axis=1)
df_copy.isna().sum()
Location 0 MinTemp 0 MaxTemp 0 Rainfall 0 Evaporation 59694 WindGustDir 9163 WindGustSpeed 9105 Humidity9am 0 Humidity3pm 0 Pressure9am 13743 Pressure3pm 13769 Cloud9am 52625 Cloud3pm 56094 Temp9am 0 Temp3pm 0 RainToday 0 RainTomorrow 0 day 0 month 0 year 0 dtype: int64
# Separate numeric columns from categorical (object-dtype) columns.
df_num = df_copy.select_dtypes(include=[np.number])
df_cat = df_copy.select_dtypes(include=['object'])
Uso un encoder per trasformare i dati categorici in dati numerici automaticamente
# Label-encode every categorical column into integer codes, then rebuild
# a single frame with the (untouched) numeric columns appended.
df_cat = df_cat.apply(lambda column: LabelEncoder().fit_transform(column))
df_encoded = pd.concat([df_cat, df_num], axis=1)
df_encoded.head()
| Location | WindGustDir | RainToday | RainTomorrow | day | month | year | MinTemp | MaxTemp | Rainfall | Evaporation | WindGustSpeed | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2 | 13 | 0 | 0 | 1 | 11 | 0 | 13.4 | 22.9 | 0.6 | NaN | 44.0 | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.0 | NaN | 16.9 | 21.8 |
| 1 | 2 | 14 | 0 | 0 | 1 | 11 | 1 | 7.4 | 25.1 | 0.0 | NaN | 44.0 | 44.0 | 25.0 | 1010.6 | 1007.8 | NaN | NaN | 17.2 | 24.3 |
| 2 | 2 | 15 | 0 | 0 | 1 | 11 | 2 | 12.9 | 25.7 | 0.0 | NaN | 46.0 | 38.0 | 30.0 | 1007.6 | 1008.7 | NaN | 2.0 | 21.0 | 23.2 |
| 3 | 2 | 4 | 0 | 0 | 1 | 11 | 3 | 9.2 | 28.0 | 0.0 | NaN | 24.0 | 45.0 | 16.0 | 1017.6 | 1012.8 | NaN | NaN | 18.1 | 26.5 |
| 4 | 2 | 13 | 0 | 0 | 1 | 11 | 4 | 17.5 | 32.3 | 1.0 | NaN | 41.0 | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.0 | 8.0 | 17.8 | 29.7 |
Dato l'elevato numero di tuple NaN rimanenti nelle colonne si è deciso di utilizzare un imputer. Un imputer è una tecnica utilizzata nella fase di preprocessing dei dati per gestire i valori mancanti in un dataset. L'imputer viene utilizzato per riempire i valori mancanti con valori appropriati in base alle caratteristiche dei dati.
# Confirm every column is now numeric before imputation.
df_encoded.info()
<class 'pandas.core.frame.DataFrame'> Index: 140787 entries, 0 to 145458 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Location 140787 non-null int32 1 WindGustDir 140787 non-null int32 2 RainToday 140787 non-null int32 3 RainTomorrow 140787 non-null int32 4 day 140787 non-null int32 5 month 140787 non-null int32 6 year 140787 non-null int32 7 MinTemp 140787 non-null float64 8 MaxTemp 140787 non-null float64 9 Rainfall 140787 non-null float64 10 Evaporation 81093 non-null float64 11 WindGustSpeed 131682 non-null float64 12 Humidity9am 140787 non-null float64 13 Humidity3pm 140787 non-null float64 14 Pressure9am 127044 non-null float64 15 Pressure3pm 127018 non-null float64 16 Cloud9am 88162 non-null float64 17 Cloud3pm 84693 non-null float64 18 Temp9am 140787 non-null float64 19 Temp3pm 140787 non-null float64 dtypes: float64(13), int32(7) memory usage: 18.8 MB
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# Model-based imputation: each column with NaNs is iteratively regressed
# on the other columns.
imputer = IterativeImputer()
df_imputed = df_encoded.copy()
df_imputed = imputer.fit_transform(df_imputed)  # impute missing values; returns a numpy array
df_imputed = pd.DataFrame(df_imputed, columns=df_encoded.columns)  # back to a pandas DataFrame
df_imputed.head(100)
| Location | WindGustDir | RainToday | RainTomorrow | day | month | year | MinTemp | MaxTemp | Rainfall | Evaporation | WindGustSpeed | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp9am | Temp3pm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 2.0 | 13.0 | 0.0 | 0.0 | 1.0 | 11.0 | 0.0 | 13.4 | 22.9 | 0.6 | 5.978670 | 44.0 | 71.0 | 22.0 | 1007.7 | 1007.1 | 8.000000 | 5.039727 | 16.9 | 21.8 |
| 1 | 2.0 | 14.0 | 0.0 | 0.0 | 1.0 | 11.0 | 1.0 | 7.4 | 25.1 | 0.0 | 6.242761 | 44.0 | 44.0 | 25.0 | 1010.6 | 1007.8 | 1.776385 | 2.605029 | 17.2 | 24.3 |
| 2 | 2.0 | 15.0 | 0.0 | 0.0 | 1.0 | 11.0 | 2.0 | 12.9 | 25.7 | 0.0 | 8.273388 | 46.0 | 38.0 | 30.0 | 1007.6 | 1008.7 | 2.037233 | 2.000000 | 21.0 | 23.2 |
| 3 | 2.0 | 4.0 | 0.0 | 0.0 | 1.0 | 11.0 | 3.0 | 9.2 | 28.0 | 0.0 | 6.242160 | 24.0 | 45.0 | 16.0 | 1017.6 | 1012.8 | 1.371820 | 2.029181 | 18.1 | 26.5 |
| 4 | 2.0 | 13.0 | 0.0 | 0.0 | 1.0 | 11.0 | 4.0 | 17.5 | 32.3 | 1.0 | 7.194498 | 41.0 | 82.0 | 33.0 | 1010.8 | 1006.0 | 7.000000 | 8.000000 | 17.8 | 29.7 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 95 | 2.0 | 10.0 | 0.0 | 0.0 | 2.0 | 2.0 | 7.0 | 11.0 | 30.2 | 0.0 | 6.895401 | 24.0 | 54.0 | 20.0 | 1017.0 | 1014.7 | 1.857876 | 2.275389 | 17.6 | 28.8 |
| 96 | 2.0 | 5.0 | 0.0 | 0.0 | 2.0 | 2.0 | 8.0 | 13.8 | 31.8 | 0.0 | 7.556472 | 24.0 | 49.0 | 28.0 | 1019.7 | 1015.9 | 2.302235 | 2.598391 | 18.6 | 30.5 |
| 97 | 2.0 | 4.0 | 0.0 | 1.0 | 2.0 | 2.0 | 9.0 | 15.5 | 32.0 | 0.0 | 8.374073 | 50.0 | 51.0 | 25.0 | 1019.5 | 1016.2 | 3.150445 | 3.638528 | 20.1 | 30.8 |
| 98 | 2.0 | 4.0 | 1.0 | 0.0 | 2.0 | 2.0 | 10.0 | 18.4 | 30.5 | 1.2 | 7.175351 | 44.0 | 57.0 | 23.0 | 1021.3 | 1018.0 | 3.344974 | 2.861642 | 21.5 | 29.6 |
| 99 | 2.0 | 13.0 | 0.0 | 1.0 | 2.0 | 2.0 | 11.0 | 20.9 | 25.7 | 0.0 | 6.882608 | 37.0 | 52.0 | 90.0 | 1019.5 | 1018.9 | 6.130625 | 8.000000 | 22.2 | 18.8 |
100 rows × 20 columns
# Verify no NaNs remain (all columns become float64 after imputation).
df_imputed.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 140787 entries, 0 to 140786 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Location 140787 non-null float64 1 WindGustDir 140787 non-null float64 2 RainToday 140787 non-null float64 3 RainTomorrow 140787 non-null float64 4 day 140787 non-null float64 5 month 140787 non-null float64 6 year 140787 non-null float64 7 MinTemp 140787 non-null float64 8 MaxTemp 140787 non-null float64 9 Rainfall 140787 non-null float64 10 Evaporation 140787 non-null float64 11 WindGustSpeed 140787 non-null float64 12 Humidity9am 140787 non-null float64 13 Humidity3pm 140787 non-null float64 14 Pressure9am 140787 non-null float64 15 Pressure3pm 140787 non-null float64 16 Cloud9am 140787 non-null float64 17 Cloud3pm 140787 non-null float64 18 Temp9am 140787 non-null float64 19 Temp3pm 140787 non-null float64 dtypes: float64(20) memory usage: 21.5 MB
Come si può vedere l'imputer ha trasformato in float i dati che prima erano int. Converto quindi nuovamente queste colonne in int
# The imputer returned everything as float64; restore the integer dtype
# of the originally-categorical/date columns.
integer_columns = ['Location', 'WindGustDir', 'RainTomorrow', 'RainToday',
                   'day', 'month', 'year']
for column in integer_columns:
    df_imputed[column] = df_imputed[column].astype(int)
df_imputed.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 140787 entries, 0 to 140786 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Location 140787 non-null int32 1 WindGustDir 140787 non-null int32 2 RainToday 140787 non-null int32 3 RainTomorrow 140787 non-null int32 4 day 140787 non-null int32 5 month 140787 non-null int32 6 year 140787 non-null int32 7 MinTemp 140787 non-null float64 8 MaxTemp 140787 non-null float64 9 Rainfall 140787 non-null float64 10 Evaporation 140787 non-null float64 11 WindGustSpeed 140787 non-null float64 12 Humidity9am 140787 non-null float64 13 Humidity3pm 140787 non-null float64 14 Pressure9am 140787 non-null float64 15 Pressure3pm 140787 non-null float64 16 Cloud9am 140787 non-null float64 17 Cloud3pm 140787 non-null float64 18 Temp9am 140787 non-null float64 19 Temp3pm 140787 non-null float64 dtypes: float64(13), int32(7) memory usage: 17.7 MB
warnings.filterwarnings('ignore')
# Re-plot the per-column histograms.
# NOTE(review): this plots dataFrameWeather (pre-imputation), not
# df_imputed — confirm which frame was intended.
fig, ax = plt.subplots(figsize=(10, 10))
dataFrameWeather.hist(ax=ax)
plt.subplots_adjust(right=1.2, top=1.2)
plt.show()
# Pairwise scatter of the temperature/humidity features.
num = ['Humidity3pm', 'Humidity9am', 'Temp3pm', 'MinTemp', 'MaxTemp', 'Temp9am']
sb.pairplot(dataFrameWeather[num], kind='scatter', diag_kind='hist', palette='Rainbow')
plt.show()
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
# Standardize the features (DBSCAN is distance-based, so scaling matters)
scaler = StandardScaler()
df_scaled = scaler.fit_transform(df_imputed)
# Create DBSCAN object
dbscan = DBSCAN(eps=1.7, min_samples=4)
# Perform clustering; label -1 marks noise points (treated as outliers)
clusters = dbscan.fit_predict(df_scaled)
# Add the cluster labels to the DataFrame
df_imputed['Cluster'] = clusters
# NOTE(review): the result of this sum over the noise rows is discarded —
# it only displays in a notebook cell.
df_imputed[df_imputed['Cluster'] == -1].sum()
#df_no_outliers = df_imputed[clusters != -1]
# Visualize the clusters
plt.scatter(df_imputed['Temp9am'], df_imputed['Temp3pm'], c=df_imputed['Cluster'], cmap='viridis')
plt.xlabel('Temp9am')
plt.ylabel('Temp3pm')
plt.title('DBSCAN Clustering')
plt.show()
# Remove the DBSCAN noise points, then re-plot without them.
df_imputed = df_imputed[clusters != -1]
plt.scatter(df_imputed['Temp9am'], df_imputed['Temp3pm'], c=df_imputed['Cluster'], cmap='viridis')
plt.xlabel('Temp9am')
plt.ylabel('Temp3pm')
plt.title('DBSCAN Clustering')
plt.show()
# The helper column is no longer needed.
df_imputed = df_imputed.drop(columns=['Cluster'], axis=1)
Plotto la matrice di correlazione servendoci di una heatmap per capire il grado di correlazione tra i vari dati
# Build the correlation matrix
matr_corr = df_imputed.corr()
# Render it as an annotated heatmap
plt.figure(figsize=(15,15))
sb.heatmap(matr_corr, annot=True, cmap='coolwarm', fmt=".2f")
plt.show()
Da una veloce analisi della matrice di correlazione possiamo notare come alcuni attributi sono bassamente correlati agli altri. Questo ci permette di dropparli senza problemi così da ridurre il numero di features da tenere in considerazione
Andiamo adesso a studiare l'importanza delle feature. Sappiamo di voler fare predizione sull'attributo 'RainTomorrow', andiamo quindi a creare un RandomForest che ci permetterà di capire quali sono gli attributi più importanti a riguardo.
# Fit a RandomForest on all features to rank their importance with
# respect to the RainTomorrow target.
df_X = df_imputed.drop(['RainTomorrow'], axis=1)
df_Y = df_imputed['RainTomorrow']
forest = RandomForestClassifier()
forest.fit(df_X, df_Y)
attributi= df_X.columns
importances= forest.feature_importances_ # importance score of each feature
index= np.argsort(importances) # feature indices sorted by increasing importance
plt.figure(figsize=(10,10))
plt.title('Grado di importanza fra gli attributi')
plt.barh(range(len(index)),importances[index],color='r',align='center')
plt.yticks(range(len(index)),attributi[index])
plt.show()
# Row/column count after outlier removal.
df_imputed.shape
(107145, 20)
# NOTE(review): the result of this drop is never assigned, so df_imputed
# still has all 20 columns afterwards (confirmed by the later .info()) —
# assign the result if dropping these features is actually intended.
df_imputed.drop(['day', 'month', 'year', 'Location', 'WindGustDir', 'Temp9am', 'MinTemp'], axis=1)
| RainToday | RainTomorrow | MaxTemp | Rainfall | Evaporation | WindGustSpeed | Humidity9am | Humidity3pm | Pressure9am | Pressure3pm | Cloud9am | Cloud3pm | Temp3pm | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 0 | 25.1 | 0.0 | 6.242761 | 44.0 | 44.0 | 25.0 | 1010.6 | 1007.8 | 1.776385 | 2.605029 | 24.3 |
| 2 | 0 | 0 | 25.7 | 0.0 | 8.273388 | 46.0 | 38.0 | 30.0 | 1007.6 | 1008.7 | 2.037233 | 2.000000 | 23.2 |
| 3 | 0 | 0 | 28.0 | 0.0 | 6.242160 | 24.0 | 45.0 | 16.0 | 1017.6 | 1012.8 | 1.371820 | 2.029181 | 26.5 |
| 5 | 0 | 0 | 29.7 | 0.2 | 7.845996 | 56.0 | 55.0 | 23.0 | 1009.2 | 1005.4 | 3.018157 | 2.790305 | 28.9 |
| 6 | 0 | 0 | 25.0 | 0.0 | 7.791964 | 50.0 | 49.0 | 19.0 | 1009.6 | 1008.2 | 1.000000 | 1.831301 | 24.6 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 140782 | 0 | 0 | 21.8 | 0.0 | 5.301208 | 31.0 | 59.0 | 27.0 | 1024.7 | 1021.2 | 2.574186 | 2.859711 | 20.9 |
| 140783 | 0 | 0 | 23.4 | 0.0 | 5.794565 | 31.0 | 51.0 | 24.0 | 1024.6 | 1020.3 | 1.809039 | 2.504704 | 22.4 |
| 140784 | 0 | 0 | 25.3 | 0.0 | 5.885369 | 22.0 | 56.0 | 21.0 | 1023.5 | 1019.1 | 1.663488 | 2.125562 | 24.5 |
| 140785 | 0 | 0 | 26.9 | 0.0 | 6.959350 | 37.0 | 53.0 | 24.0 | 1021.0 | 1016.8 | 1.944675 | 2.446405 | 26.1 |
| 140786 | 0 | 0 | 27.0 | 0.0 | 7.357367 | 28.0 | 51.0 | 24.0 | 1019.4 | 1016.5 | 3.000000 | 2.000000 | 26.0 |
107145 rows × 13 columns
Visto l'elevatissimo numero di tuple si decide di fare campionamento sul dataframe così da poter lavorare con un numero di tuple minore
#df_balanced_sampled = df_imputed.sample(n = 100110, random_state=35) #n and random_state were chosen arbitrarily
# Subsampling is disabled: the full frame is used as-is.
df_balanced_sampled = df_imputed.copy()
df_balanced_sampled.info()
<class 'pandas.core.frame.DataFrame'> Index: 107145 entries, 1 to 140786 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Location 107145 non-null int32 1 WindGustDir 107145 non-null int32 2 RainToday 107145 non-null int32 3 RainTomorrow 107145 non-null int32 4 day 107145 non-null int32 5 month 107145 non-null int32 6 year 107145 non-null int32 7 MinTemp 107145 non-null float64 8 MaxTemp 107145 non-null float64 9 Rainfall 107145 non-null float64 10 Evaporation 107145 non-null float64 11 WindGustSpeed 107145 non-null float64 12 Humidity9am 107145 non-null float64 13 Humidity3pm 107145 non-null float64 14 Pressure9am 107145 non-null float64 15 Pressure3pm 107145 non-null float64 16 Cloud9am 107145 non-null float64 17 Cloud3pm 107145 non-null float64 18 Temp9am 107145 non-null float64 19 Temp3pm 107145 non-null float64 dtypes: float64(13), int32(7) memory usage: 14.3 MB
# Same shape as df_imputed, since no sampling was applied.
df_balanced_sampled.shape
(107145, 20)
warnings.filterwarnings('ignore')
# Histograms of the working frame before balancing.
fig, ax = plt.subplots(figsize=(10, 10))
df_balanced_sampled.hist(ax=ax)
plt.subplots_adjust(right=1.2, top=1.2)
plt.show()
Per poter proseguire con i primi classificatori bisogna prima bilanciare il dataset. In particolare l'attributo 'RainTomorrow' sulla quale bisogna fare predizione risulta essere molto sbilanciato da come si può vedere dal grafico sottostante
# Class balance of the target before oversampling.
plt.figure(figsize=(4, 4))
ax = sb.countplot(x=df_imputed['RainTomorrow'])
plt.bar_label(ax.containers[0])
plt.show()
Ciò che verrà fatto è quello di utilizzare il sampling, in particolare verrà fatto oversampling sui dati per bilanciarli
# Oversample the minority class ('RainTomorrow' == 1) with SMOTE so the
# two target classes end up with the same number of instances.
X = df_balanced_sampled.drop(columns=['RainTomorrow'])
y = df_balanced_sampled['RainTomorrow']
smote = SMOTE(random_state=35)
X_smoted, y_smoted = smote.fit_resample(X, y)
# Re-attach the resampled target as the last column of the balanced frame.
df_balanced = pd.concat([X_smoted, y_smoted], axis=1)
# Class distribution after SMOTE: the two classes are now balanced.
plt.figure(figsize=(4, 4))
balanced_ax = sb.countplot(x=df_balanced['RainTomorrow'])
plt.bar_label(balanced_ax.containers[0])
plt.show()
# Verify row count and dtypes after SMOTE (classes now balanced)
df_balanced.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 180204 entries, 0 to 180203 Data columns (total 20 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Location 180204 non-null int32 1 WindGustDir 180204 non-null int32 2 RainToday 180204 non-null int32 3 day 180204 non-null int32 4 month 180204 non-null int32 5 year 180204 non-null int32 6 MinTemp 180204 non-null float64 7 MaxTemp 180204 non-null float64 8 Rainfall 180204 non-null float64 9 Evaporation 180204 non-null float64 10 WindGustSpeed 180204 non-null float64 11 Humidity9am 180204 non-null float64 12 Humidity3pm 180204 non-null float64 13 Pressure9am 180204 non-null float64 14 Pressure3pm 180204 non-null float64 15 Cloud9am 180204 non-null float64 16 Cloud3pm 180204 non-null float64 17 Temp9am 180204 non-null float64 18 Temp3pm 180204 non-null float64 19 RainTomorrow 180204 non-null int32 dtypes: float64(13), int32(7) memory usage: 22.7 MB
Salvo il dataframe in un file .csv
# Persist the balanced dataframe and reload it as the final working copy.
final_csv = 'df_final.csv'
df_balanced.to_csv(final_csv, index=False)
df_final = pd.read_csv(final_csv)
Finito il lavoro di pulizia, bilanciamento ed esplorazione del dataset si può passare alla fase di training dei classificatori. Dividiamo il dataset in training_set e test_set
# 67/33 train/test split of the final balanced dataset.
df_X = df_final.drop(columns='RainTomorrow')
df_y = df_final['RainTomorrow']
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.33, random_state=42
)
y_train = y_train.values.ravel()  # plain 1-D ndarray, as the estimators expect
# Decision tree (depth capped at 18 to limit overfitting), evaluated on the
# training set with a confusion matrix and a classification report.
albero = DecisionTreeClassifier(random_state=42, max_depth=18)
albero = albero.fit(X_train, y_train)
y_pred_train_albero = albero.predict(X_train)
# FIX: bind to `cm` instead of `confusion_matrix` — the original name shadowed
# sklearn's confusion_matrix function pulled in by `from sklearn.metrics import *`.
cm = metrics.confusion_matrix(y_train, y_pred_train_albero)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_train, y_pred_train_albero))
precision recall f1-score support
0 0.99 0.99 0.99 60411
1 0.99 0.99 0.99 60325
accuracy 0.99 120736
macro avg 0.99 0.99 0.99 120736
weighted avg 0.99 0.99 0.99 120736
# Evaluate the fitted decision tree on the held-out test set.
y_pred_test_albero = albero.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_albero)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_albero))
precision recall f1-score support
0 0.93 0.91 0.92 29691
1 0.91 0.93 0.92 29777
accuracy 0.92 59468
macro avg 0.92 0.92 0.92 59468
weighted avg 0.92 0.92 0.92 59468
# Gaussian Naive Bayes, evaluated on the training set.
nb_clf = GaussianNB()
nb_clf = nb_clf.fit(X_train, y_train)
y_pred_train_bayes = nb_clf.predict(X_train)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_train, y_pred_train_bayes)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_train, y_pred_train_bayes))
precision recall f1-score support
0 0.81 0.82 0.81 60411
1 0.82 0.80 0.81 60325
accuracy 0.81 120736
macro avg 0.81 0.81 0.81 120736
weighted avg 0.81 0.81 0.81 120736
# Evaluate the fitted Naive Bayes model on the test set.
y_pred_test_bayes = nb_clf.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_bayes)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_bayes))
precision recall f1-score support
0 0.81 0.82 0.81 29691
1 0.82 0.80 0.81 29777
accuracy 0.81 59468
macro avg 0.81 0.81 0.81 59468
weighted avg 0.81 0.81 0.81 59468
# Logistic regression; max_iter raised to 5000 so the solver converges.
logreg_clf = LogisticRegression(random_state=42, max_iter=5000)
logreg_clf = logreg_clf.fit(X_train, y_train)
y_pred_train_logreg_clf = logreg_clf.predict(X_train)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_train, y_pred_train_logreg_clf)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_train, y_pred_train_logreg_clf))
precision recall f1-score support
0 0.85 0.83 0.84 60411
1 0.83 0.85 0.84 60325
accuracy 0.84 120736
macro avg 0.84 0.84 0.84 120736
weighted avg 0.84 0.84 0.84 120736
# Evaluate the fitted logistic regression on the test set.
y_pred_test_logreg_clf = logreg_clf.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_logreg_clf)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_logreg_clf))
precision recall f1-score support
0 0.85 0.83 0.84 29691
1 0.83 0.85 0.84 29777
accuracy 0.84 59468
macro avg 0.84 0.84 0.84 59468
weighted avg 0.84 0.84 0.84 59468
# K-nearest-neighbours classifier (k = 5), evaluated on the test set.
# NOTE(review): `import warnings` belongs at the top of the file; kept local
# here because this chunk cannot edit the file header. Dead commented-out
# timing/benchmark code removed.
import warnings
warnings.filterwarnings('ignore')
Knn_clf = KNeighborsClassifier(n_neighbors=5)
Knn_clf = Knn_clf.fit(X_train, y_train)
y_pred_test_Knn_clf = Knn_clf.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_Knn_clf)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_Knn_clf))
precision recall f1-score support
0 0.99 0.81 0.89 29691
1 0.84 0.99 0.91 29777
accuracy 0.90 59468
macro avg 0.91 0.90 0.90 59468
weighted avg 0.91 0.90 0.90 59468
# Random forest, evaluated on the test set.
# FIX: random_state added for reproducibility, consistent with every other
# seeded classifier in this notebook.
forest = RandomForestClassifier(random_state=42)
forest = forest.fit(X_train, y_train)
y_pred_test_forest = forest.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_forest)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_forest))
precision recall f1-score support
0 0.96 0.95 0.96 29691
1 0.95 0.96 0.96 29777
accuracy 0.96 59468
macro avg 0.96 0.96 0.96 59468
weighted avg 0.96 0.96 0.96 59468
# 67/33 train/test split of the final balanced dataset (repeated run).
df_X = df_final.drop(columns='RainTomorrow')
df_y = df_final['RainTomorrow']
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.33, random_state=42
)
y_train = y_train.values.ravel()  # plain 1-D ndarray, as the estimators expect
# Decision tree (depth capped at 18), evaluated on the training set.
# NOTE(review): this cell duplicates an earlier one — consider removing one copy.
albero = DecisionTreeClassifier(random_state=42, max_depth=18)
albero = albero.fit(X_train, y_train)
y_pred_train_albero = albero.predict(X_train)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_train, y_pred_train_albero)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_train, y_pred_train_albero))
precision recall f1-score support
0 0.99 0.99 0.99 60411
1 0.99 0.99 0.99 60325
accuracy 0.99 120736
macro avg 0.99 0.99 0.99 120736
weighted avg 0.99 0.99 0.99 120736
# Evaluate the fitted decision tree on the held-out test set.
y_pred_test_albero = albero.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_albero)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_albero))
precision recall f1-score support
0 0.93 0.91 0.92 29691
1 0.91 0.93 0.92 29777
accuracy 0.92 59468
macro avg 0.92 0.92 0.92 59468
weighted avg 0.92 0.92 0.92 59468
# Gaussian Naive Bayes, evaluated on the training set.
nb_clf = GaussianNB()
nb_clf = nb_clf.fit(X_train, y_train)
y_pred_train_bayes = nb_clf.predict(X_train)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_train, y_pred_train_bayes)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_train, y_pred_train_bayes))
precision recall f1-score support
0 0.81 0.82 0.81 60411
1 0.82 0.80 0.81 60325
accuracy 0.81 120736
macro avg 0.81 0.81 0.81 120736
weighted avg 0.81 0.81 0.81 120736
# Evaluate the fitted Naive Bayes model on the test set.
y_pred_test_bayes = nb_clf.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_bayes)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_bayes))
precision recall f1-score support
0 0.81 0.82 0.81 29691
1 0.82 0.80 0.81 29777
accuracy 0.81 59468
macro avg 0.81 0.81 0.81 59468
weighted avg 0.81 0.81 0.81 59468
# Logistic regression; max_iter raised to 5000 so the solver converges.
logreg_clf = LogisticRegression(random_state=42, max_iter=5000)
logreg_clf = logreg_clf.fit(X_train, y_train)
y_pred_train_logreg_clf = logreg_clf.predict(X_train)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_train, y_pred_train_logreg_clf)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_train, y_pred_train_logreg_clf))
precision recall f1-score support
0 0.85 0.83 0.84 60411
1 0.83 0.85 0.84 60325
accuracy 0.84 120736
macro avg 0.84 0.84 0.84 120736
weighted avg 0.84 0.84 0.84 120736
# Evaluate the fitted logistic regression on the test set.
y_pred_test_logreg_clf = logreg_clf.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_logreg_clf)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_logreg_clf))
precision recall f1-score support
0 0.85 0.83 0.84 29691
1 0.83 0.85 0.84 29777
accuracy 0.84 59468
macro avg 0.84 0.84 0.84 59468
weighted avg 0.84 0.84 0.84 59468
# K-nearest-neighbours classifier (k = 5), evaluated on the test set.
# NOTE(review): `import warnings` belongs at the top of the file; kept local
# here because this chunk cannot edit the file header. Dead commented-out
# timing/benchmark code removed.
import warnings
warnings.filterwarnings('ignore')
Knn_clf = KNeighborsClassifier(n_neighbors=5)
Knn_clf = Knn_clf.fit(X_train, y_train)
y_pred_test_Knn_clf = Knn_clf.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_Knn_clf)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_Knn_clf))
precision recall f1-score support
0 0.99 0.81 0.89 29691
1 0.84 0.99 0.91 29777
accuracy 0.90 59468
macro avg 0.91 0.90 0.90 59468
weighted avg 0.91 0.90 0.90 59468
# Random forest, evaluated on the test set.
# FIX: random_state added for reproducibility, consistent with every other
# seeded classifier in this notebook.
forest = RandomForestClassifier(random_state=42)
forest = forest.fit(X_train, y_train)
y_pred_test_forest = forest.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_forest)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_forest))
precision recall f1-score support
0 0.96 0.95 0.96 29691
1 0.95 0.96 0.96 29777
accuracy 0.96 59468
macro avg 0.96 0.96 0.96 59468
weighted avg 0.96 0.96 0.96 59468
# 67/33 train/test split of the final balanced dataset (repeated run).
df_X = df_final.drop(columns='RainTomorrow')
df_y = df_final['RainTomorrow']
X_train, X_test, y_train, y_test = train_test_split(
    df_X, df_y, test_size=0.33, random_state=42
)
y_train = y_train.values.ravel()  # plain 1-D ndarray, as the estimators expect
# Unconstrained decision tree (no max_depth). The training report below shows
# a perfect 1.00 score on every metric: the tree memorises the training set,
# i.e. it overfits; compare with the capped-depth run above.
albero = DecisionTreeClassifier(random_state=42)
albero = albero.fit(X_train, y_train)
y_pred_train_albero = albero.predict(X_train)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_train, y_pred_train_albero)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_train, y_pred_train_albero))
precision recall f1-score support
0 1.00 1.00 1.00 60411
1 1.00 1.00 1.00 60325
accuracy 1.00 120736
macro avg 1.00 1.00 1.00 120736
weighted avg 1.00 1.00 1.00 120736
# Evaluate the unconstrained decision tree on the held-out test set.
y_pred_test_albero = albero.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_albero)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_albero))
precision recall f1-score support
0 0.92 0.91 0.92 29691
1 0.91 0.93 0.92 29777
accuracy 0.92 59468
macro avg 0.92 0.92 0.92 59468
weighted avg 0.92 0.92 0.92 59468
# Gaussian Naive Bayes, evaluated on the training set.
nb_clf = GaussianNB()
nb_clf = nb_clf.fit(X_train, y_train)
y_pred_train_bayes = nb_clf.predict(X_train)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_train, y_pred_train_bayes)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_train, y_pred_train_bayes))
precision recall f1-score support
0 0.81 0.82 0.81 60411
1 0.82 0.80 0.81 60325
accuracy 0.81 120736
macro avg 0.81 0.81 0.81 120736
weighted avg 0.81 0.81 0.81 120736
# Evaluate the fitted Naive Bayes model on the test set.
y_pred_test_bayes = nb_clf.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_bayes)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_bayes))
precision recall f1-score support
0 0.81 0.82 0.81 29691
1 0.82 0.80 0.81 29777
accuracy 0.81 59468
macro avg 0.81 0.81 0.81 59468
weighted avg 0.81 0.81 0.81 59468
# Logistic regression; max_iter raised to 5000 so the solver converges.
logreg_clf = LogisticRegression(random_state=42, max_iter=5000)
logreg_clf = logreg_clf.fit(X_train, y_train)
y_pred_train_logreg_clf = logreg_clf.predict(X_train)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_train, y_pred_train_logreg_clf)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_train, y_pred_train_logreg_clf))
precision recall f1-score support
0 0.85 0.83 0.84 60411
1 0.83 0.85 0.84 60325
accuracy 0.84 120736
macro avg 0.84 0.84 0.84 120736
weighted avg 0.84 0.84 0.84 120736
# Evaluate the fitted logistic regression on the test set.
y_pred_test_logreg_clf = logreg_clf.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_logreg_clf)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_logreg_clf))
precision recall f1-score support
0 0.85 0.83 0.84 29691
1 0.83 0.85 0.84 29777
accuracy 0.84 59468
macro avg 0.84 0.84 0.84 59468
weighted avg 0.84 0.84 0.84 59468
# K-nearest-neighbours classifier (k = 5), evaluated on the test set.
# NOTE(review): `import warnings` belongs at the top of the file; kept local
# here because this chunk cannot edit the file header. Dead commented-out
# timing/benchmark code removed.
import warnings
warnings.filterwarnings('ignore')
Knn_clf = KNeighborsClassifier(n_neighbors=5)
Knn_clf = Knn_clf.fit(X_train, y_train)
y_pred_test_Knn_clf = Knn_clf.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_Knn_clf)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_Knn_clf))
precision recall f1-score support
0 0.99 0.81 0.89 29691
1 0.84 0.99 0.91 29777
accuracy 0.90 59468
macro avg 0.91 0.90 0.90 59468
weighted avg 0.91 0.90 0.90 59468
# Random forest, evaluated on the test set.
# FIX: random_state added for reproducibility, consistent with every other
# seeded classifier in this notebook.
forest = RandomForestClassifier(random_state=42)
forest = forest.fit(X_train, y_train)
y_pred_test_forest = forest.predict(X_test)
# FIX: use `cm` so sklearn's confusion_matrix function is not shadowed.
cm = metrics.confusion_matrix(y_test, y_pred_test_forest)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=[0, 1])
fig, ax = plt.subplots(figsize=(3, 3))
cm_display.plot(ax=ax, values_format='d')
plt.show()
print(classification_report(y_test, y_pred_test_forest))
precision recall f1-score support
0 0.96 0.95 0.96 29691
1 0.95 0.96 0.96 29777
accuracy 0.96 59468
macro avg 0.96 0.96 0.96 59468
weighted avg 0.96 0.96 0.96 59468